# [You have to change] set this following work directory to your file location
setwd("D:/Bioinformatics Projects/LindseyAanalysis/single_cell/two_brain_groups/")
library(Seurat)
## Attaching SeuratObject
## Attaching sp
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(cowplot)
library(ggplot2)
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
library(ComplexHeatmap)
## Loading required package: grid
## ========================================
## ComplexHeatmap version 2.12.0
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
##
## If you use it in published research, please cite:
## Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional
## genomic data. Bioinformatics 2016.
##
## The new InteractiveComplexHeatmap package can directly export static
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================
library(circlize)
## ========================================
## circlize version 0.4.15
## CRAN page: https://cran.r-project.org/package=circlize
## Github page: https://github.com/jokergoo/circlize
## Documentation: https://jokergoo.github.io/circlize_book/book/
##
## If you use it in published research, please cite:
## Gu, Z. circlize implements and enhances circular visualization
## in R. Bioinformatics 2014.
##
## This message can be suppressed by:
## suppressPackageStartupMessages(library(circlize))
## ========================================
library(SingleCellExperiment)
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
##
## count
##
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind, colnames,
## dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
## grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
## order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
## rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
## union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following object is masked from 'package:rstatix':
##
## desc
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:sp':
##
## %over%
## The following object is masked from 'package:grDevices':
##
## windows
## Loading required package: GenomeInfoDb
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
## Attaching package: 'SummarizedExperiment'
## The following object is masked from 'package:SeuratObject':
##
## Assays
## The following object is masked from 'package:Seurat':
##
## Assays
library(patchwork)
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
library(SingleR)
library(celldex)
##
## Attaching package: 'celldex'
## The following objects are masked from 'package:SingleR':
##
## BlueprintEncodeData, DatabaseImmuneCellExpressionData,
## HumanPrimaryCellAtlasData, ImmGenData, MonacoImmuneData,
## MouseRNAseqData, NovershternHematopoieticData
wd <- list()
wd$data <- "D:/Bioinformatics Projects/LindseyAanalysis/single_cell/slim_counts/"
wd$output <- "D:/Bioinformatics Projects/LindseyAanalysis/single_cell/two_brain_groups/v8_output/"
# You might need to change the location "slim_counts/", to direct to your data folder
for (file in c("m09",
"m10",
"m11",
"m12")){
seurat_data <- Read10X(data.dir = paste0("D:/Bioinformatics Projects/LindseyAanalysis/single_cell/slim_counts/", file))
seurat_obj <- CreateSeuratObject(counts = seurat_data,
min.features = 100,
project = file)
assign(file, seurat_obj)
}
m09 <- AddMetaData(object = m09, metadata = "m09", col.name = "sample.id")
m10 <- AddMetaData(object = m10, metadata = "m10", col.name = "sample.id")
m11 <- AddMetaData(object = m11, metadata = "m11", col.name = "sample.id")
m12 <- AddMetaData(object = m12, metadata = "m12", col.name = "sample.id")
m09 <- AddMetaData(object = m09, metadata = "WT", col.name = "group.id")
m10 <- AddMetaData(object = m10, metadata = "WT", col.name = "group.id")
m11 <- AddMetaData(object = m11, metadata = "KO", col.name = "group.id")
m12 <- AddMetaData(object = m12, metadata = "KO", col.name = "group.id")
gallitano <- merge(m09, y = c(m10,m11,m12),
add.cell.ids = c("m09","m10","m11", "m12"),
project = "two_groups")
gallitano[["percent.mt"]] <- PercentageFeatureSet(gallitano, pattern = "^mt-")
rm(m09, m10, m11, m12)
rm(seurat_obj, seurat_data)
head(gallitano)
## orig.ident nCount_RNA nFeature_RNA sample.id group.id
## m09_AAACCCACAAAGACTA-1 m09 2042 993 m09 WT
## m09_AAACCCACAACCGATT-1 m09 13933 4279 m09 WT
## m09_AAACCCACACTGCGTG-1 m09 3407 2057 m09 WT
## m09_AAACCCATCGACATTG-1 m09 19957 5090 m09 WT
## m09_AAACCCATCTCGTTTA-1 m09 10917 3830 m09 WT
## m09_AAACGAAAGTACAGAT-1 m09 8316 3191 m09 WT
## m09_AAACGAAAGTCACGAG-1 m09 1004 620 m09 WT
## m09_AAACGAACAGAACATA-1 m09 15571 4718 m09 WT
## m09_AAACGAAGTGGTTTAC-1 m09 4293 1752 m09 WT
## m09_AAACGAATCAGTGATC-1 m09 44495 7519 m09 WT
## percent.mt
## m09_AAACCCACAAAGACTA-1 11.361410
## m09_AAACCCACAACCGATT-1 8.433216
## m09_AAACCCACACTGCGTG-1 6.750807
## m09_AAACCCATCGACATTG-1 11.564864
## m09_AAACCCATCTCGTTTA-1 6.531098
## m09_AAACGAAAGTACAGAT-1 4.882155
## m09_AAACGAAAGTCACGAG-1 15.637450
## m09_AAACGAACAGAACATA-1 12.080149
## m09_AAACGAAGTGGTTTAC-1 16.887957
## m09_AAACGAATCAGTGATC-1 3.577930
tail(gallitano)
## orig.ident nCount_RNA nFeature_RNA sample.id group.id
## m12_TTTGGTTAGGTTCATC-1 m12 1290 639 m12 KO
## m12_TTTGGTTCACCAGCGT-1 m12 5415 2483 m12 KO
## m12_TTTGGTTCAGGTATGG-1 m12 6148 2765 m12 KO
## m12_TTTGGTTGTCCTGTTC-1 m12 542 396 m12 KO
## m12_TTTGGTTGTGAGGCAT-1 m12 618 459 m12 KO
## m12_TTTGTTGAGGAGCTGT-1 m12 1841 1056 m12 KO
## m12_TTTGTTGCAAGTACCT-1 m12 4701 2242 m12 KO
## m12_TTTGTTGGTAGGACCA-1 m12 2935 1647 m12 KO
## m12_TTTGTTGTCCGACGGT-1 m12 13358 2314 m12 KO
## m12_TTTGTTGTCTTCTAAC-1 m12 7137 3542 m12 KO
## percent.mt
## m12_TTTGGTTAGGTTCATC-1 29.302326
## m12_TTTGGTTCACCAGCGT-1 2.271468
## m12_TTTGGTTCAGGTATGG-1 2.569941
## m12_TTTGGTTGTCCTGTTC-1 6.457565
## m12_TTTGGTTGTGAGGCAT-1 1.456311
## m12_TTTGTTGAGGAGCTGT-1 4.454101
## m12_TTTGTTGCAAGTACCT-1 3.531164
## m12_TTTGTTGGTAGGACCA-1 1.635434
## m12_TTTGTTGTCCGACGGT-1 1.235215
## m12_TTTGTTGTCTTCTAAC-1 2.101723
table(gallitano@meta.data$sample.id)
##
## m09 m10 m11 m12
## 5194 15772 9932 4357
# m09 m10 m11 m12
# 5194 15772 9932 4357
table(gallitano@meta.data$group.id)
##
## KO WT
## 14289 20966
# KO WT
#14289 20966
VlnPlot(gallitano,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
group.by = "sample.id",
ncol = 1) + ggtitle("Quality assessment before filtering")
plot1 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "percent.mt")
line.data <- data.frame(yintercept = c(1500, 7500), Lines = c("lower", "upper"))
line2.data <- data.frame(xintercept = c(500), Lines = c("lower"))
plot2 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data) + geom_vline(aes(xintercept = xintercept, linetype = Lines), line2.data)
plot1 + plot2
# joint filtering effect
# Visualize the correlation between genes detected and number of UMIs and determine whether strong presence of cells with low numbers of genes/UMIs
metadata <- gallitano@meta.data
metadata %>%
ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = c(1500, 7500))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
# After subseting, I also remove the genes with zero count in more than
100 cells
gallitano <- subset(gallitano, nFeature_RNA < 7500 &
nFeature_RNA>1500 & nCount_RNA > 2000 &
percent.mt < 20)
# Output a logical vector for every gene on whether the more than zero counts per cell
# Extract counts
counts <- GetAssayData(object = gallitano, slot = "counts")
# Output a logical vector for every gene on whether there more than zero counts per cell
nonzero <- counts > 0
# Sums all TRUE values and returns TRUE if more than 100 TRUE values per gene
keep_genes <- Matrix::rowSums(nonzero) >= 100
# Only keeping those genes expressed in more than 100 cells
filtered_counts <- counts[keep_genes, ]
# Reassign to filtered Seurat object
gallitano <- CreateSeuratObject(filtered_counts, meta.data = gallitano@meta.data)
dim(gallitano)
## [1] 14377 11424
# 14377 genes x 11424 cells (or UMIs)
table(gallitano@meta.data$sample.id)
##
## m09 m10 m11 m12
## 1966 3633 3896 1929
# m09 m10 m11 m12
# 1966 3633 3896 1929
VlnPlot(gallitano,
features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
ncol = 1) + ggtitle("Quality assessment after filtering")
plot1 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "percent.mt")
line.data <- data.frame(yintercept = c(1500, 7500), Lines = c("lower", "upper"))
line2.data <- data.frame(xintercept = c(500), Lines = c("lower"))
plot2 <- FeatureScatter(gallitano, feature1 = "nCount_RNA", feature2 = "nFeature_RNA") + geom_hline(aes(yintercept = yintercept, linetype = Lines), line.data) + geom_vline(aes(xintercept = xintercept, linetype = Lines), line2.data)
plot1 + plot2
metadata <- gallitano@meta.data
metadata %>%
ggplot(aes(x=nCount_RNA, y=nFeature_RNA, color=percent.mt)) +
geom_point() +
scale_colour_gradient(low = "gray90", high = "black") +
stat_smooth(method=lm) +
scale_x_log10() +
scale_y_log10() +
theme_classic() +
geom_vline(xintercept = 500) +
geom_hline(yintercept = c(1500, 7500))
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: colour
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
options(future.globals.maxSize = 9000 * 1024^2) #100 gb
# Split seurat object by condition to perform cell cycle scoring and SCT on all samples
split_seurat <- SplitObject(gallitano, split.by = "sample.id")
split_seurat <- split_seurat[c("m09","m10", "m11", "m12")]
for (i in 1:length(split_seurat)) {
split_seurat[[i]] <- NormalizeData(split_seurat[[i]], verbose = TRUE)
split_seurat[[i]] <- SCTransform(split_seurat[[i]],method = "glmGamPoi", vars.to.regress = c("percent.mt"))
}
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14376 by 1966
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 1966 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Found 43 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14376 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 14376 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 35.23748 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt
## Centering data matrix
## Set default assay to SCT
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14377 by 3633
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 3633 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Found 48 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14377 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 14377 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 58.54576 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt
## Centering data matrix
## Set default assay to SCT
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14377 by 3896
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 3896 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Found 74 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14377 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 14377 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 1.096737 mins
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt
## Centering data matrix
## Set default assay to SCT
## Calculating cell attributes from input UMI matrix: log_umi
## Variance stabilizing transformation of count matrix of size 14377 by 1929
## Model formula is y ~ log_umi
## Get Negative Binomial regression parameters per gene
## Using 2000 genes, 1929 cells
##
|
| | 0%
|
|================== | 25%
|
|=================================== | 50%
|
|==================================================== | 75%
|
|======================================================================| 100%
## Found 56 outliers - those will be ignored in fitting/regularization step
## Second step: Get residuals using fitted parameters for 14377 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Computing corrected count matrix for 14377 genes
##
|
| | 0%
|
|== | 3%
|
|===== | 7%
|
|======= | 10%
|
|========== | 14%
|
|============ | 17%
|
|============== | 21%
|
|================= | 24%
|
|=================== | 28%
|
|====================== | 31%
|
|======================== | 34%
|
|=========================== | 38%
|
|============================= | 41%
|
|=============================== | 45%
|
|================================== | 48%
|
|==================================== | 52%
|
|======================================= | 55%
|
|========================================= | 59%
|
|=========================================== | 62%
|
|============================================== | 66%
|
|================================================ | 69%
|
|=================================================== | 72%
|
|===================================================== | 76%
|
|======================================================== | 79%
|
|========================================================== | 83%
|
|============================================================ | 86%
|
|=============================================================== | 90%
|
|================================================================= | 93%
|
|==================================================================== | 97%
|
|======================================================================| 100%
## Calculating gene attributes
## Wall clock passed: Time difference of 31.74446 secs
## Determine variable features
## Place corrected count matrix in counts slot
## Regressing out percent.mt
## Centering data matrix
## Set default assay to SCT
rm(gallitano)
# Select the most variable features to use for integration
integ_features <- SelectIntegrationFeatures(object.list = split_seurat,
nfeatures = 3000)
# Prepare the SCT list object for integration
split_seurat <- PrepSCTIntegration(object.list = split_seurat,
anchor.features = integ_features)
# Find best buddies - can take a while to run
integ_anchors <- FindIntegrationAnchors(object.list = split_seurat,
normalization.method = "SCT",
anchor.features = integ_features)
## Finding all pairwise anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 5438 anchors
## Filtering anchors
## Retained 5090 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 6101 anchors
## Filtering anchors
## Retained 5797 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 7857 anchors
## Filtering anchors
## Retained 7257 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 4657 anchors
## Filtering anchors
## Retained 4571 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 5532 anchors
## Filtering anchors
## Retained 5454 anchors
## Running CCA
## Merging objects
## Finding neighborhoods
## Finding anchors
## Found 5960 anchors
## Filtering anchors
## Retained 5864 anchors
# Integrate across conditions
seurat_integrated <- IntegrateData(anchorset = integ_anchors,
normalization.method = "SCT")
## Merging dataset 4 into 3
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 1 into 2
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
## Merging dataset 2 1 into 3 4
## Extracting anchors for merged samples
## Finding integration vectors
## Finding integration vector weights
## Integrating data
rm(integ_anchors)
rm(integ_features)
rm(split_seurat)
# Run the standard workflow for visualization and clustering
seurat_integrated <- RunPCA(object = seurat_integrated, npcs = 40, verbose = F)
ElbowPlot(seurat_integrated, ndims = 40)
PCAPlot(seurat_integrated)
seurat_integrated <- RunUMAP(seurat_integrated,
dims = 1:40,
reduction = "pca")
## Warning: The default method for RunUMAP has changed from calling Python UMAP via reticulate to the R-native UWOT using the cosine metric
## To use Python UMAP via reticulate, set umap.method to 'umap-learn' and metric to 'correlation'
## This message will be shown once per session
## 13:40:43 UMAP embedding parameters a = 0.9922 b = 1.112
## 13:40:44 Read 11424 rows and found 40 numeric columns
## 13:40:44 Using Annoy for neighbor search, n_neighbors = 30
## 13:40:44 Building Annoy index with metric = cosine, n_trees = 50
## 0% 10 20 30 40 50 60 70 80 90 100%
## [----|----|----|----|----|----|----|----|----|----|
## **************************************************|
## 13:40:47 Writing NN index file to temp file C:\Users\Admin\AppData\Local\Temp\Rtmpe6F2eQ\file3edc7a0f2236
## 13:40:47 Searching Annoy index using 1 thread, search_k = 3000
## 13:40:50 Annoy recall = 100%
## 13:40:52 Commencing smooth kNN distance calibration using 1 thread
## 13:40:55 Initializing from normalized Laplacian + noise
## 13:40:56 Commencing optimization for 200 epochs, with 476818 positive edges
## 13:41:12 Optimization finished
seurat_integrated <- RunTSNE(seurat_integrated,
reduction = "pca")
# Determine the K-nearest neighbor graph
seurat_integrated <- FindNeighbors(object = seurat_integrated,
dims = 1:40)
## Computing nearest neighbor graph
## Computing SNN
# Determine the clusters for various resolutions
seurat_integrated <- FindClusters(object = seurat_integrated,
resolution = c(0.05, 0.5))
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11424
## Number of edges: 399100
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9858
## Number of communities: 12
## Elapsed time: 1 seconds
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 11424
## Number of edges: 399100
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9497
## Number of communities: 25
## Elapsed time: 1 seconds
#setwd("D:/Bioinformatics Projects/LindseyAanalysis/single_cell/two_brain_groups")
#saveRDS(seurat_integrated, "./v8_output/Seurat_integrated_twobraingroups.RDS")
This step gives us some idea about how is the distribution of the number of genes, number of UMIs, and percentage of mitochondrial genes in each cluster. Normally, we expect to see similar distribution of no. of genes (nFeature_RNA) and no. of UMIs (nCount_RNA).
As for the percent.mt (percentage of mitochondrial genes per cell), it can be a reference to check if those high intensity clusters might be having poor quality cells (if so, we can try to remove in the next step or adjust the metrics in the previous filtering step) or it might be due to the differences biologically
# Determine metrics to plot present in seurat_integrated@meta.data
metrics <- c("nFeature_RNA", "nCount_RNA", "percent.mt")
FeaturePlot(seurat_integrated,
reduction = "umap",
features = metrics,
split.by = "sample.id",
pt.size = 0.4,
min.cutoff = 'q10',
label = TRUE)
Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.05
# Idents(object = seurat_integrated) <- seurat_integrated$sample.id
# Idents(object = seurat_integrated) <- seurat_integrated$group.id
DimPlot(seurat_integrated, reduction = "tsne", label = TRUE, pt.size = 1, label.size = 6)
DimPlot(seurat_integrated, reduction = "umap", label = TRUE, pt.size = 2, label.size = 6)
PCAPlot(seurat_integrated)
# do feature plots for selected genes
select_genes = c('Trem2','Snap25','Epas1', 'Egr1', 'Npas4', 'Calcrl')
FeaturePlot(seurat_integrated, features =select_genes, cols =c('gray','red'), reduction ='tsne', pt.size = 0.7,ncol = 3, label = T)
FeaturePlot(seurat_integrated, features =select_genes, cols =c('gray','red'), reduction = 'umap',pt.size = 0.7,ncol=3, label = T)
# do heatmaps
select_genes2 <- c('Snap25','Gad1','Gad2', 'Slc32a1', 'Slc17a7', 'Lamp5', 'Aqp4', 'Gfap', "Ndnf", 'Sncg', 'Vip', 'Trem2','Sst','Pvalb', 'Cux2', 'Rorb', 'Fezf2', 'Foxp2', 'Npas4', 'Mbp', 'Cldn5', 'Ctss', 'C1qa')
cellRanks <- seurat_integrated@meta.data[order(seurat_integrated@meta.data$integrated_snn_res.0.05),] # check cellRanks structure!
PartialMatrix <- seurat_integrated@assays$integrated@scale.data[which(rownames(seurat_integrated@assays$integrated@scale.data) %in% select_genes2), rownames(cellRanks)]
cellRanks$col <- rainbow(max(as.numeric(cellRanks$integrated_snn_res.0.05))+1)[as.numeric(cellRanks$integrated_snn_res.0.5)+1]
#
ha_column <- HeatmapAnnotation(
df = data.frame(
ClusterID = as.numeric(cellRanks$integrated_snn_res.0.05)
),
col = list(
ClusterID = colorRamp2(unique(as.numeric(cellRanks$integrated_snn_res.0.05)),
rainbow(max(as.numeric(cellRanks$integrated_snn_res.0.05))))
)
)
ht1 = Heatmap(PartialMatrix, name = "Scaled\nUMI", column_title = "Allen SMARTseq dataset genes",
top_annotation = ha_column, show_column_names=FALSE, cluster_columns = FALSE,
row_names_gp = gpar(fontsize = 16))
## `use_raster` is automatically set to TRUE for a matrix with more than
## 2000 columns You can control `use_raster` argument by explicitly
## setting TRUE/FALSE to it.
##
## Set `ht_opt$message = FALSE` to turn off this message.
## 'magick' package is suggested to install to give better rasterization.
##
## Set `ht_opt$message = FALSE` to turn off this message.
# do not use TRUE for clustering, too many cells cannot see clearly
ht1
# ht1
# heatmap, without gene name cluster, adopt listed gene sequences
DoHeatmap(seurat_integrated, features = select_genes2)
DoHeatmap(seurat_integrated, features = select_genes2, group.by = 'group.id' )
#DoHeatmap(seurat_integrated, cells = 1:100, features = select_genes2, group.by = 'group.id' )
DoHeatmap(seurat_integrated, features = select_genes2, group.by = 'seurat_clusters' )
# additiontal plots, adjusting format
Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.5
DimPlot(seurat_integrated, label = T, pt.size = 1, label.size = 6)
plot <- VlnPlot(seurat_integrated, assay = "RNA", features = c("Gad1","Trem2"), combine = FALSE, fill.by = c("feature","ident"))
plot <- lapply(
X = plot,
FUN = function(p) p + aes(color= seurat_integrated$integrated_snn_res.0.5)
)
# plot$layers[[2]]$aes_params$alpha <- 0.1
CombinePlots(plots = plot, legend = 'none')
## Warning: CombinePlots is being deprecated. Plots should now be combined using
## the patchwork system.
#p0 <- VlnPlot(seurat_integrated, "Gad1", assay = "RNA", pt.size = 0.2)
#p1 <- VlnPlot(seurat_integrated, "Gad1", assay = "RNA", )
#p1$layers[[2]]$aes_params$alpha <- 0.1
#p0+p1
Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.05
# or for finer clusters:
# Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.5
DoHeatmap(seurat_integrated, features = select_genes2, disp.min = -1,
slot = 'scale.data',
group.colors = rainbow(9)) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red",midpoint = 1)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
mapal <- colorRampPalette(RColorBrewer::brewer.pal(11,"RdBu"))(256)
DoHeatmap(seurat_integrated, features = select_genes2, angle = 90, size = 3) + scale_fill_gradientn(colours = rev(mapal))
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
# trial DEGs and GSEA analysis for a cluster. Note: the following block
can run, but it takes long time and i am not sure it means anythig…so
let’s skip this block.
# Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.05
# PG.markers.7 <- FindMarkers(seurat_integrated, ident.1 = "0", min.pct = 0.20) # we will adopt the variable name .7, but to represent cluster 0.
#PG.markers.7$gene <- rownames(PG.markers.7)
#library(biomaRt)
#gene_id <- getBM(attributes = c("ensembl_gene_id","external_gene_name","entrezgene_id"), mart = useDataset("mmusculus_gene_ensembl", useMart("ensembl"))) # takes longer time
#PG.markers.7 <- merge(PG.markers.7, gene_id[,c(2,3)], by.x = "gene", by.y = "external_gene_name")
#PG.markers.7.filt <- PG.markers.7
#PG.markers.7.filt <- PG.markers.7.filt[!duplicated(PG.markers.7.filt$entrezgene_id),]
#PG.markers.7.filt <- PG.markers.7[which(PG.markers.7$p_val_adj < 0.05),]
#genelist <- PG.markers.7$avg_log2FC
#names(genelist) <- PG.markers.7$entrezgene_id
#genelist <- sort(genelist, decreasing = TRUE)
#library(fgsea)
#library(msigdbr)
#library(data.table)
#library(tidyverse)
#m_df <- msigdbr(species = "Mus musculus", category = "C2") %>% dplyr::select(gs_name, entrez_gene)
#m_df_fgsea <- split(x = m_df$entrez_gene, f = m_df$gs_name)
#fgseaRes <- fgsea(pathways = m_df_fgsea,
#stats = genelist,
#eps = 0.05,
#minSize = 15,
#maxSize = 800) # takes longer ~40min for cluster0 genes
#selected <- fgseaRes[c(979, 329, 443, 445, 907),]
#selected <- selected$pathway
#png(file = "./v8_output/fgsea table.png", width = 800, height = 500)
#plotGseaTable(m_df_fgsea[selected], genelist, fgseaRes,
# gseaParam=0.5)
#dev.off()
Idents(seurat_integrated)<- "seurat_clusters"
cluster1.markers <- FindMarkers(seurat_integrated, ident.1 = 1, min.pct = 0.25, only.pos = TRUE) #compare cluster1 with all the rest of clusters
head(cluster1.markers, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Apoe 0 3.822062 1.000 0.196 0
## Aldoc 0 5.758969 0.998 0.102 0
## Cst3 0 3.490138 0.999 0.152 0
## Atp1a2 0 3.423945 1.000 0.160 0
## Mt1 0 5.716655 0.997 0.137 0
VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster1.markers)[1:15], pt.size = 0, stack = TRUE, flip = TRUE)
VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster1.markers)[1:5], stack = TRUE, flip = TRUE, split.by = "group.id", split.plot = TRUE)
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
#p1 <- VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster1.markers)[1:5], pt.size = 0, stack = TRUE, flip = TRUE)
## p1$layers[[1]]$aes_params$alpha <- 0.1 #does not work
#p1
cluster2.markers <- FindMarkers(seurat_integrated, ident.1 = 2, ident.2 = c(3,4), min.pct = 0.25, only.pos = TRUE) #compare cluster2 with clusters 3 and 4
VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster2.markers)[1:15], stack = TRUE, flip = TRUE)
# for all markers-higher resolution
Qiu.markers <- FindAllMarkers(seurat_integrated,
only.pos=TRUE, min.pct=0.25, logfc.threshold=0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
## Calculating cluster 16
## Calculating cluster 17
## Calculating cluster 18
## Calculating cluster 19
## Calculating cluster 20
## Calculating cluster 21
## Calculating cluster 22
## Calculating cluster 23
## Calculating cluster 24
write.csv(Qiu.markers,paste0(wd$output,"Qiu_all_markers_20220226.csv"))
head(Qiu.markers,10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## Snap25 0.000000e+00 3.2560060 0.821 0.275 0.000000e+00 0 Snap25
## Nrgn 0.000000e+00 3.0582446 0.714 0.247 0.000000e+00 0 Nrgn
## Ywhaz 0.000000e+00 2.8312954 0.801 0.246 0.000000e+00 0 Ywhaz
## Ywhah 0.000000e+00 2.7537510 0.804 0.310 0.000000e+00 0 Ywhah
## Calm2 0.000000e+00 2.0530944 0.787 0.304 0.000000e+00 0 Calm2
## Camk2n1 0.000000e+00 1.8180151 0.712 0.250 0.000000e+00 0 Camk2n1
## Cck 0.000000e+00 0.8036294 0.702 0.220 0.000000e+00 0 Cck
## Basp1 4.240489e-258 1.6305991 0.711 0.312 1.272147e-254 0 Basp1
## Ppp3ca 1.347563e-252 2.0267032 0.708 0.303 4.042689e-249 0 Ppp3ca
## Chn1 9.230218e-252 2.2142401 0.714 0.330 2.769065e-248 0 Chn1
y <- Qiu.markers %>% group_by(cluster) %>% top_n(n = 1, wt = avg_log2FC)
VlnPlot(seurat_integrated, assay = "RNA", features = y$gene, stack = TRUE)
v3 <- VlnPlot(seurat_integrated, assay = "RNA", features = "Foxp2")
#v3 <- VlnPlot(seurat_integrated, assay = "RNA", features = c("Foxp2", "Kcnab3"))
v3$layers[[2]]$aes_params$alpha <- 0.1
v3
# VlnPlot(seurat_integrated, features = y$gene[1:10], stack = TRUE)
FeaturePlot(seurat_integrated, features = y$gene[1:9]) # top gene marker for the first 9 cluster
FeaturePlot(seurat_integrated, features = y$gene[1:2], blend = TRUE)
top10 <- Qiu.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
DoHeatmap(seurat_integrated, features = top10$gene) + NoLegend()
# repeat for lower resolution clusters
Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.05
cluster1.markers.low.res <- FindMarkers(seurat_integrated, ident.1 = 1, min.pct = 0.25, only.pos = TRUE) #compare cluster1 with all the rest of clusters
head(cluster1.markers.low.res, n = 5)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Meg3 0 2.9960808 0.735 0.310 0
## Snhg11 0 2.2490148 0.830 0.314 0
## Slc17a7 0 0.6629423 0.786 0.310 0
## R3hdm1 0 8.1347943 0.735 0.252 0
## Atp2b2 0 5.4205087 0.783 0.288 0
VlnPlot(seurat_integrated, assay="RNA", features = row.names(cluster1.markers.low.res)[1:25], pt.size = 0, stack = TRUE, flip = TRUE)
cluster2.markers.low.res <- FindMarkers(seurat_integrated, ident.1 = 2, ident.2 = c(0,4), min.pct = 0.25, only.pos = TRUE) #compare cluster2 with clusters 0 and 4
head(cluster2.markers.low.res)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## Apoe 0 4.620273 1.000 0.120 0
## Aldoc 0 5.749845 0.998 0.129 0
## Cst3 0 5.363414 0.998 0.089 0
## Atp1a2 0 5.673813 0.999 0.092 0
## Mt1 0 5.715023 0.997 0.114 0
## Plpp3 0 7.071290 0.999 0.095 0
VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster2.markers.low.res)[1:15], stack = TRUE, flip = TRUE)
VlnPlot(seurat_integrated, assay = "RNA", features = row.names(cluster2.markers.low.res)[1:15], stack = TRUE, flip = TRUE, split.by = "group.id", split.plot = TRUE)
# repeat for lower resolution clustering
Qiu.markers.low.res <- FindAllMarkers(seurat_integrated,
only.pos=TRUE, min.pct=0.25, logfc.threshold=0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
write.csv(Qiu.markers.low.res,paste0(wd$output,"Qiu_all_markers_low_res.csv"))
head(Qiu.markers.low.res,10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj cluster gene
## Camk2n1 0 6.603516 0.670 0.167 0 0 Camk2n1
## Itpka 0 6.012532 0.497 0.163 0 0 Itpka
## Snap25 0 5.284015 0.793 0.168 0 0 Snap25
## Nrgn 0 5.172864 0.726 0.140 0 0 Nrgn
## Nrn1 0 5.054037 0.722 0.159 0 0 Nrn1
## Lmo4 0 4.579140 0.694 0.196 0 0 Lmo4
## Map1b 0 4.499258 0.675 0.272 0 0 Map1b
## Hpca 0 4.489036 0.661 0.203 0 0 Hpca
## Snca 0 4.299523 0.753 0.190 0 0 Snca
## Nrsn1 0 4.278136 0.551 0.206 0 0 Nrsn1
y <- Qiu.markers.low.res %>% group_by(cluster) %>% top_n(n = 1, wt = avg_log2FC)
v2 <- VlnPlot(seurat_integrated, assay = "RNA", features = y$gene, stack = TRUE, flip = TRUE)
v2
v3 <- VlnPlot(seurat_integrated, assay = "RNA", features = c("Foxp2", "Snap25"))
v3$layers[[2]]$aes_params$alpha <- 0.1
v3
# VlnPlot(seurat_integrated, features = y$gene[1:10], stack = TRUE)
v4 <- FeaturePlot(seurat_integrated, features = y$gene[1:9]) # top gene marker for the first 9 cluster
v4
top10 <- Qiu.markers.low.res %>% group_by(cluster) %>% top_n(n = 10, wt = avg_log2FC)
v5 <- DoHeatmap(seurat_integrated, features = top10$gene) + NoLegend()
v5
I use a collection of mouse bulk RNA-seq data sets obtained from celldex package (Benayoun et al. 2019). A variety of cell types are available, mostly from blood but also covering several other tissues. This identifies marker genes from the reference and uses them to compute assignment scores (based on the Spearman correlation across markers) for each cell in the test dataset against each label in the reference. The label with the highest score is the assigned to the test cell, possibly with further fine-tuning to resolve closely related labels.
This reference consists of a collection of mouse bulk RNA-seq data sets downloaded from the gene expression omnibus (Benayoun et al. 2019). A variety of cell types are available, again mostly from blood but also covering several other tissues.
#BiocManager::install("SingleR")
#BiocManager::install("celldex")
Idents(seurat_integrated)<- "seurat_clusters"
ref <- MouseRNAseqData()
## snapshotDate(): 2022-04-26
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
## see ?celldex and browseVignettes('celldex') for documentation
## loading from cache
table(Idents(object = seurat_integrated))
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1599 1399 1120 691 662 625 621 543 456 435 427 395 390 373 297 198
## 16 17 18 19 20 21 22 23 24
## 194 174 174 160 159 109 78 74 71
hpca.se <- SingleR(test = seurat_integrated@assays$RNA@data, ref = ref,
labels = ref$label.main)
p1 <- plotScoreHeatmap(hpca.se)
p1
plotDeltaDistribution(hpca.se, ncol = 3)
## Warning: Groups with fewer than two data points have been dropped.
## Warning: The following aesthetics were dropped during statistical transformation: x, y,
## PANEL, group
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
## Warning in max(data$density): no non-missing arguments to max; returning -Inf
## Warning: Computation failed in `stat_ydensity()`
## Caused by error in `$<-.data.frame`:
## ! replacement has 1 row, data has 0
all.markers <- metadata(hpca.se)$de.genes
seurat_integrated$labels_hpca <- hpca.se$labels
# Extract identity and sample information from seurat object to determine the number of cells per cluster per sample
n_cells <- FetchData(seurat_integrated,
vars = c("integrated_snn_res.0.5",
"labels_hpca")) %>%
dplyr::count(integrated_snn_res.0.5,
labels_hpca) %>%
tidyr::spread(integrated_snn_res.0.5, n)
# View(n_cells)
write.csv(n_cells, paste0(wd$output,"n_cells_mouseRNAseqData_twobraingroups.csv"))
## comparing with the unsupervised clustering
tab <- table(cluster= seurat_integrated$integrated_snn_res.0.5, label= hpca.se$labels)
p3 <- pheatmap::pheatmap(log10(tab+10))
Idents(object = seurat_integrated) <- seurat_integrated$integrated_snn_res.0.5
Plotting Astrocyte Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Aqp4","Prdx6","Pla2g7"))
Plotting Microglia Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("C1qc","Ly86", "Tmem119"))
Plotting Endothelial Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Ly6a", "Ly6c1", "Pltp"))
Plotting Oligodendrocyte Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Mag", "Mbp", "Cldn11"))
Plotting Glutamatergic Neuron Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Nrgn", "Sv2b", "Snap25"))
Plotting Gabaergic Neuron Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Gad1", "Gad2", "Rab3b" ))
Plotting Oligodendrocyte Precursor Markers
VlnPlot(object = seurat_integrated, assay = "RNA", features =c("Pdgfra", "Cspg4", "Gm2a"))
seurat_integrated <- RenameIdents(object = seurat_integrated,
"0" = "Neurons(Glutamatergic)",
"1" = "Astrocytes",
"2" = "Endothelial cells",
"3" = "Neurons(Glutamatergic)",
"4" = "Microglia",
"5" = "Neurons(Glutamatergic)",
"6" = "Oligodendrocytes",
"7" = "Neurons(Glutamatergic)",
"8" = "Neurons(Glutamatergic)",
"9" = "OPC_precursor",
"10" = "Neurons(Gabaergic)",
"11" = "Neurons(Glutamatergic)",
"12" = "Neurons(Gabaergic)",
"13" = "Neurons(Glutamatergic)",
"14" = "Neurons(Glutamatergic)",
"15" = "Endothelial cells",
"16" = "Endothelial cells",
"17" = "Neurons(Glutamatergic)",
"18" = "Astrocytes",
"19" = "Neurons(Glutamatergic)",
"20"= "Astrocytes",
"21"= "Neurons(Glutamatergic)",
"22"= "Oligodendrocytes",
"23"= "Endothelial cells",
"24"= "Endothelial cells" )
seurat_integrated$celltype_LC <- seurat_integrated@active.ident
DimPlot(seurat_integrated,
reduction = "umap",
group.by = "integrated_snn_res.0.5",
label = TRUE,
)
DimPlot(seurat_integrated,
reduction = "umap",
group.by = "integrated_snn_res.0.5",
split.by = "sample.id",
label = TRUE,
)
Idents(object = seurat_integrated) <- "celltype_LC"
plot <- DimPlot(seurat_integrated,
reduction = "umap",
group.by = "celltype_LC",
label = FALSE,
)
plot$data$seurat_clusters <- seurat_integrated@meta.data$integrated_snn_res.0.5
all(rownames(plot$data) == rownames(seurat_integrated@meta.data))
## [1] TRUE
LabelClusters(plot, id = "seurat_clusters")
Idents(object = seurat_integrated) <- "celltype_LC"
plot <- DimPlot(seurat_integrated,
reduction = "umap",
group.by = "celltype_LC",
split.by = "sample.id",
label = FALSE,
) #split.by = "group.id"
plot$data$seurat_clusters <- seurat_integrated@meta.data$integrated_snn_res.0.5
all(rownames(plot$data) == rownames(seurat_integrated@meta.data))
## [1] TRUE
LabelClusters(plot, id = "seurat_clusters")
saveRDS(seurat_integrated, paste0(wd$output,"seurat_integrated_twobraingroups.RDS"))
Idents(seurat_integrated) <- "group.id"
groups <- unique(Idents(seurat_integrated)) # -only two groups, WT KO
pairwise <- combn(groups, 2)
results <- list()
# one comparison group WT and KO that takes 25 min
for(i in 1:ncol(pairwise)) {
markers <- FindMarkers(seurat_integrated, ident.1 = pairwise[1, i], ident.2 = pairwise[2, i], logfc.threshold = 0)
comparisons <- pairwise[, i]
markers$comparison <- paste(comparisons[1], comparisons[2], sep = '_')
results[[i]] <- markers
}
results.df <- do.call(rbind, results)
saveRDS(results.df, paste0(wd$output,"between_group_comp.rds"))
Idents(seurat_integrated)<- "group.id"
clusters <- levels(seurat_integrated$celltype_LC)
groups <- unique(Idents(seurat_integrated))
pairwise <- combn(groups, 2)
full_results<-data.frame()
for (k in 1:length(clusters)){
results <- list()
seurat_integrated.subset <- subset(seurat_integrated, subset= celltype_LC == clusters[k])
for(i in 1:ncol(pairwise)) {
markers <- FindMarkers(seurat_integrated.subset, ident.1 = pairwise[1, i], ident.2 = pairwise[2, i], logfc.threshold = 0)
comparisons <- pairwise[, i]
markers$comparison <- paste(comparisons[1], comparisons[2], sep = '_')
markers$genes<- rownames(markers)
results[[i]] <- markers
}
results.df <- do.call(rbind, results)
results.df$Cluster<- clusters[k]
full_results<-rbind(full_results,results.df)
}
# full_results
saveRDS(full_results, paste0(wd$output,"full_results.rds"))
write.csv(full_results, paste0(wd$output,"full_results_marker_comp_by_clusters.csv"))
Refer to Tasic et al, Nature 2018, marker list # https://github.com/AllenInstitute/tasic2018analysis/blob/master/RNA-seq%20Analysis/markers.R
Inh.markers <- list(c("Gad2","Slc32a1","Prox1","Adarb2","Nfix","Nfib","Cacna2d1",
"Cxcl14","Tnfaip8l3","Cplx3","Lamp5","Cd34","Pax6","Krt73",
"Scrg1","Egln3","Ndnf","Tmem182","Ntn1","Pde11a","Pdlim5",
"Lsp1","Slc35d3","Nkx2-1","Serpinf1","Col14a1","Vip","Sncg",
"Crabp1","Slc10a4","Cldn10","Bhlhe22","Crispld2","Slc17a8",
"Cyb5r2","Nr1h4","Wnt7b","Prss12","Igfbp6","Calb2","Grpr",
"Pthlh","Elfn1","Rspo1","Slc18a3","Lmo1","Rspo4","Sostdc1",
"Chat","Cbln4","Gsx2","Gpc3","Mab21l1","C1ql1","Itih5","Mybpc1",
"Myl1","Lhx6","Sox6","Sst","Chodl","Calb1","Cbln4","Etv1","Edn1",
"Kl","Il1rapl2","Myh8","Ptprk","Chrna2","Myh13","Ptgdr","Crhr2",
"Hpse","Igsf9","C1ql3","Tacstd2","Th","Col6a1","Nts","Tac1","Pvalb",
"Gabrg1","Il7","Bche","Prdm8","Syt2","Ostn","Pdlim3","C1ql1",
"Gpr149","Vipr2","Meis2","Adamts19","Cpa6","Lgr6"))
Inh.comb.markers <- c("Reln","Cnr1","Nr2f2","Cck","Npy","Crh","Tac2")
Ex.markers <- list(unique(c("Slc17a7","Rtn4rl2","Slc30a3","Cux2","Stard8","Otof","Rrad",
"Penk","Agmat",
"Emx2","S100a3","Macc1","Rorb","Scnn1a","Whrn","Endou","Col26a1",
"Rspo1","Fezf2","Hsd11b1","Batf3","Arhgap25","Colq","Pld5","Olfr78",
"Tcap","Fgf17","Wfdc18","Wfdc17","Aldh1a7","Tgfb1","Ctsc","Rxfp2",
"Prss35","Rgs12","Osr1","Oprk1","Cd52","Col23a1","Col18a1","Car1",
"Car3","Fam84b","Chrna6","Chrnb3","Fn1","Tac1","Lce3c","Erg",
"Cdc42ep5","Bmp5","Pvalb","Depdc7","Stac","C1ql2","Ptgfr","Slco2a1",
"Pappa2","Dppa1","Npsr1","Htr2c","Hpgd","Nxph3","Sla2","Tshz2",
"Rapgef3","Slc17a8","Trh","Nxph2","Foxp2","Col12a1","Syt6","Col5a1",
"Gpr139","Ly6d","Sla","Cpa6","Ppp1r18","Faim3","Ctxn3","Nxph4",
"Cplx3","Ctgf","Col8a1","Mup5","Ngf","Fam150a","F2r","Serpinb11","Fbxl7",
"P2ry12","Crh","Kynu","Hsd17b2","Mup3","Tlcd1","Lhx5","Trp73","Cpa6",
"Gkn1","Col18a1","Lce3c","Erg","Bmp5","Stac","C1ql2","Slco2a1","Lrrc9",
"Trhr","Myzap","Krt80","H60b","Fam150a","Clic5","Kcnj5","Olfr110",
"Olfr111")))
Ex.comb.markers <- c("Reln","Cdh13","Cpne7","Alcam","Rprm","Marcksl1")
Global.markers <- list(c("Fez1","Phyhipl","Aplp1","Gnao1","Caly","Snap25","Atp1a3","Camk2b",
"Syt1","Gabrg2","Fabp3","Stmn2","Kif5c","Slc32a1","Gad2","Dlx1","Dlx5",
"Dlx2","Dlx6os1","Slc6a1","Sox2","Slc17a7","Nrn1","Neurod2","Sv2b","Satb2",
"Tbr1","Vsig2","Cmtm5","Kcnj10","S100a16","S100a13","S1pr1","Gja1","Gjb6",
"Aqp4","Lcat","Acsbg1","Olig1","Sox10","Neu4","Sapcd2","Gpr17","Plp1",
"Cldn11","Mag","Mog","Nkx6-2","Enpp6","9630013A20Rik","Brca1",
"Mog","Opalin","Gjb1","Hapln2","Cyba","Ctsh","Ifitm3","Sparc",
"S100a11","Dcn","Col1a1","Pltp","Vtn","Slc6a13","Spp1","Slc13a3",
"Col15a1","Slc47a1","Tgtp2","Ifi47","Esam","Slco1a4","Slc38a5",
"Cldn5","H2-Q7","Slc38a11","Art3","Ace2","Acta2","Myh11","Pln",
"Gja5","Kcnj8","Atp13a5","Aoc3","Ctss","C1qb","C1qc","C1qa","Cbr2",
"F13a1","Pf4","Mrc1","Siglech","Selplg"))
seurat_integrated <- AddModuleScore(object = seurat_integrated,
features = Ex.markers,
name = "Excitatory_marker_score")
## Warning: The following features are not present in the object: Rtn4rl2, Agmat,
## Macc1, Scnn1a, Col26a1, Arhgap25, Colq, Olfr78, Fgf17, Wfdc17, Aldh1a7, Rxfp2,
## Prss35, Car1, Car3, Fam84b, Chrna6, Chrnb3, Lce3c, Cdc42ep5, Depdc7, Stac,
## Ptgfr, Slco2a1, Pappa2, Dppa1, Npsr1, Sla2, Trh, Nxph2, Col5a1, Gpr139, Cpa6,
## Faim3, Ctxn3, Nxph4, Col8a1, Mup5, Fam150a, F2r, Serpinb11, Fbxl7, Kynu,
## Hsd17b2, Mup3, Lhx5, Trp73, Gkn1, Lrrc9, Myzap, Krt80, H60b, Kcnj5, Olfr110,
## Olfr111, not searching for symbol synonyms
FeaturePlot(object = seurat_integrated, features = "Excitatory_marker_score1", label = T) +
ggtitle("FeaturePlot_Excitatory marker score (refer to Tasic et al., 2018, Nature)")
seurat_integrated <- AddModuleScore(object = seurat_integrated,
features = Inh.markers,
name = "Inhibitory_marker_score")
## Warning: The following features are not present in the object: Nfix, Tnfaip8l3,
## Krt73, Tmem182, Slc35d3, Nkx2-1, Crabp1, Slc10a4, Cyb5r2, Nr1h4, Prss12, Grpr,
## Slc18a3, Rspo4, Sostdc1, Chat, Gsx2, Mab21l1, Mybpc1, Myl1, Chodl, Kl, Il1rapl2,
## Myh8, Ptprk, Chrna2, Myh13, Ptgdr, Crhr2, Hpse, Igsf9, Tacstd2, Th, Il7, Prdm8,
## Ostn, Gpr149, Vipr2, Adamts19, Cpa6, not searching for symbol synonyms
FeaturePlot(object = seurat_integrated, features = "Inhibitory_marker_score1", label = T) +
ggtitle("FeaturePlot_Inhibitory marker score (refer to Tasic et al., 2018, Nature)")
seurat_integrated <- AddModuleScore(object = seurat_integrated,
features = Global.markers,
name = "Global_marker_score")
## Warning: The following features are not present in the object: Gnao1, Camk2b,
## Fabp3, Kif5c, Dlx5, Satb2, Vsig2, 9630013A20Rik, Slc47a1, Tgtp2, Ifi47, Gja5,
## Aoc3, Cbr2, F13a1, Pf4, Mrc1, not searching for symbol synonyms
FeaturePlot(object = seurat_integrated, features = "Global_marker_score1", label = T) +
ggtitle("FeaturePlot_Global marker score (refer to Tasic et al., 2018, Nature)")
saveRDS(seurat_integrated, paste0(wd$output,"seurat_integrated_twobraingroups.RDS"))
Qiu.ab<- subset(seurat_integrated, subset = group.id %in% c("WT", "KO"))
ab_markers<- subset(full_results, subset = comparison == "WT_KO")
ab_markers<- subset(ab_markers, subset = p_val_adj < 0.005)
ab_markers<- ab_markers[order(-abs(ab_markers$avg_log2FC)),]
write.csv(ab_markers, paste0(wd$output,"sorted_ab_marker_all_clusters.csv"))
##
num_markers<- as.data.frame(table(ab_markers$Cluster))
num_markers<- num_markers[order(num_markers$Freq),]
p<-ggplot(data=num_markers, aes(x=Var1, y=Freq, fill=Var1)) +
geom_bar(stat="identity")+
theme_minimal()+
coord_flip()+
labs(title="Number of Differentially Expressed Genes by Cluster in AB", y="Number of Differentially Expressed Genes", x="Cell Type")+
theme(legend.position="none")
p
##
Idents(Qiu.ab)<-'celltype_LC'
features<- unique(ab_markers$genes[1:11])
DotPlot(Qiu.ab, features = features) + RotatedAxis()
# cluster specific comparison and violin plot
ab_markers_GluN <- subset(ab_markers, subset = ab_markers$p_val_adj < 0.005& ab_markers$Cluster =="Neurons(Glutamatergic)")
ab_markers_GluN<- ab_markers_GluN[order(-abs(ab_markers_GluN$avg_log2FC)),]
head(ab_markers_GluN,20)
## p_val avg_log2FC pct.1 pct.2 p_val_adj comparison genes
## Prrg1 9.466617e-08 -5.228159 0.117 0.047 2.839985e-04 WT_KO Prrg1
## Pdlim3 4.729550e-09 -5.200669 0.182 0.018 1.418865e-05 WT_KO Pdlim3
## Tagln 6.255062e-29 4.970390 0.107 0.028 1.876519e-25 WT_KO Tagln
## Klk6 9.132891e-42 -4.666907 0.236 0.034 2.739867e-38 WT_KO Klk6
## Plin4 2.272906e-28 -4.563661 0.159 0.011 6.818717e-25 WT_KO Plin4
## Socs3 6.082897e-07 -4.499833 0.143 0.154 1.824869e-03 WT_KO Socs3
## Fbxl22 1.425446e-139 -4.479195 0.425 0.078 4.276337e-136 WT_KO Fbxl22
## Lamb1 6.690106e-08 -4.451595 0.173 0.071 2.007032e-04 WT_KO Lamb1
## Sox8 6.142291e-16 -4.419788 0.137 0.088 1.842687e-12 WT_KO Sox8
## Mustn1 1.831936e-60 -4.376585 0.254 0.146 5.495808e-57 WT_KO Mustn1
## Emx2os 4.476503e-22 -4.374314 0.203 0.096 1.342951e-18 WT_KO Emx2os
## Fbln5 7.626148e-07 -4.331924 0.103 0.024 2.287844e-03 WT_KO Fbln5
## Bicc1 1.033198e-09 -4.306721 0.208 0.110 3.099593e-06 WT_KO Bicc1
## Wnt3 2.434513e-15 -4.277500 0.137 0.034 7.303538e-12 WT_KO Wnt3
## Lcp2 7.540279e-12 -4.257663 0.205 0.163 2.262084e-08 WT_KO Lcp2
## Gm15440 2.556962e-11 4.170513 0.227 0.208 7.670886e-08 WT_KO Gm15440
## Slc9a2 1.903271e-16 -4.170395 0.128 0.035 5.709813e-13 WT_KO Slc9a2
## Tox3 1.358598e-11 4.167284 0.130 0.106 4.075794e-08 WT_KO Tox3
## Ugdh 3.897508e-08 -4.067804 0.164 0.155 1.169252e-04 WT_KO Ugdh
## Dlx2 4.111044e-17 4.063206 0.142 0.020 1.233313e-13 WT_KO Dlx2
## Cluster
## Prrg1 Neurons(Glutamatergic)
## Pdlim3 Neurons(Glutamatergic)
## Tagln Neurons(Glutamatergic)
## Klk6 Neurons(Glutamatergic)
## Plin4 Neurons(Glutamatergic)
## Socs3 Neurons(Glutamatergic)
## Fbxl22 Neurons(Glutamatergic)
## Lamb1 Neurons(Glutamatergic)
## Sox8 Neurons(Glutamatergic)
## Mustn1 Neurons(Glutamatergic)
## Emx2os Neurons(Glutamatergic)
## Fbln5 Neurons(Glutamatergic)
## Bicc1 Neurons(Glutamatergic)
## Wnt3 Neurons(Glutamatergic)
## Lcp2 Neurons(Glutamatergic)
## Gm15440 Neurons(Glutamatergic)
## Slc9a2 Neurons(Glutamatergic)
## Tox3 Neurons(Glutamatergic)
## Ugdh Neurons(Glutamatergic)
## Dlx2 Neurons(Glutamatergic)
Idents(seurat_integrated) <- seurat_integrated$integrated_snn_res.0.5
# Idents(seurat_integrated) <- "celltype_LC"
v1 <- VlnPlot(seurat_integrated, assay = "RNA", features = row.names(ab_markers_GluN)[1:15], stack = TRUE, flip = TRUE, split.by = "group.id", split.plot = TRUE)
v1
Idents(seurat_integrated) <- "celltype_LC"
v2 <- VlnPlot(seurat_integrated, assay = "RNA", features = c("Unc93b1", "Fscn1", "Enpp2", "Prrg1", "Cdkn1a", "Rfx4", "Gypc", "Ddah2", "Gad1", "Sst", "Cd83", "Lyz2", "Tcf4", "Egr1", "Sox8", "Foxp2", "Met", "Hgf", "Map2k1"), stack = TRUE, flip = TRUE, split.by = "group.id", split.plot = TRUE)
v2
Idents(seurat_integrated) <- seurat_integrated$group.id
v3 <- VlnPlot(seurat_integrated, assay = "RNA", features = c("Unc93b1", "Fscn1", "Enpp2", "Prrg1", "Cdkn1a", "Rfx4", "Gypc", "Ddah2", "Gad1", "Sst", "Cd83", "Lyz2", "Tcf4", "Egr1", "Sox8", "Foxp2", "Met", "Hgf", "Map2k1"), stack = TRUE, flip = TRUE)
v3
###########################
#v1 <- VlnPlot(Qiu.ab, features = features, stack = TRUE, flip = TRUE)
#ggsave("./v8_output/top_WT_KO_DEG.pdf", width=8, height = 10)
#rm(v1)
###########################
Idents(Qiu.ab)<-'celltype_LC'
#current.cluster.ids <- c("Glutamatergic","Astrocyte","Endothelial","Microglia","Oligodendrocyte Precursor","Gabaergic","Pericyte","Oligodendrocyte","Endothelial+Pericyte")
#new.cluster.ids <- c("Gl_N","A","EC","M","O PC","Gb_N","P","O","EC+P")
#Qiu.ab@active.ident <- plyr::mapvalues(x = Qiu.ab@active.ident, from = current.cluster.ids, to = new.cluster.ids)
#Qiu.ab@meta.data$cell.type.abbr <- Qiu.ab@active.ident
DoHeatmap(subset(Qiu.ab, downsample = 100), features = unique(ab_markers$genes)[1:100], size = 3)+
theme(axis.text.y = element_text(size = 5), text=element_text(size=20))
# library(monocle3)
# library(SeuratWrappers)
#data <- GetAssayData(seurat_integrated)[VariableFeatures(seurat_integrated),]
#pd <- data.frame(seurat_integrated@meta.data)
#fData <- data.frame(gene_short_name = rownames(data), row.names = row.names(data))
#cds <- new_cell_data_set(expression_data = data,
# cell_metadata = pd,
# gene_metadata = fData)
#cds <- preprocess_cds(cds, num_dim = 12,method = "PCA")
#plot_pc_variance_explained(cds)
#cds <- reduce_dimension(cds,
#preprocess_method = "PCA",
#reduction_method= "tSNE",
#verbose = T)
#cds <- cluster_cells(cds = cds , reduction_method = "tSNE", verbose = TRUE,
#resolution = NULL)
#p1 <- plot_cells(cds, reduction_method = "tSNE")
##compare with seurat data, res = 0.5
#cds@clusters$tSNE$clusters <- seurat_integrated$integrated_snn_res.0.5
#p2 <- plot_cells(cds, reduction_method = "tSNE")
#p1 + p2
#```{r} library(ShinyCell)
setwd(wd\(output) #setwd("D:/Bioinformatics Projects/LindseyAanalysis/single_cell/two_brain_groups/v8_output") # getExampleData() # Download example dataset (~200 MB) no need to run this if you run it before, with the .rds file in working directory seurat_integrated = readRDS(paste0(wd\)output,“seurat_integrated_twobraingroups.RDS”)) scConf = createConfig(seurat_integrated) makeShinyApp(seurat_integrated, scConf, gene.mapping = TRUE, shiny.title = “four sample_two brain groups”)
setwd(paste0(wd$output,“shinyApp”)) # verify change of dir in console using getwd()
shiny::runApp() # after you run this App, another R will be open, maximize it, then select and plot and test the graphic output. While this R window is open, the > ready in console will NOT be ready. You can also select output to browser. In order for the browser to work, you need to keep this new R window open. If you close the R window, the brower will gray out. ```